# libraries for basic operations
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
from sklearn.cluster import KMeans
# plotly offline for visualization
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
# Cross validation to validate our model and xgboost for model creation
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
pyoff.init_notebook_mode()
# Load the Telco customer churn dataset and preview the first 10 rows.
df_data = pd.read_csv('churn_data.csv')
df_data.head(10)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| 5 | 9305-CDSKC | Female | 0 | No | No | 8 | Yes | Yes | Fiber optic | No | ... | Yes | No | Yes | Yes | Month-to-month | Yes | Electronic check | 99.65 | 820.5 | Yes |
| 6 | 1452-KIOVK | Male | 0 | No | Yes | 22 | Yes | Yes | Fiber optic | No | ... | No | No | Yes | No | Month-to-month | Yes | Credit card (automatic) | 89.10 | 1949.4 | No |
| 7 | 6713-OKOMC | Female | 0 | No | No | 10 | No | No phone service | DSL | Yes | ... | No | No | No | No | Month-to-month | No | Mailed check | 29.75 | 301.9 | No |
| 8 | 7892-POOKP | Female | 0 | Yes | No | 28 | Yes | Yes | Fiber optic | No | ... | Yes | Yes | Yes | Yes | Month-to-month | Yes | Electronic check | 104.80 | 3046.05 | Yes |
| 9 | 6388-TABGU | Male | 0 | No | Yes | 62 | Yes | No | DSL | Yes | ... | No | No | No | No | One year | No | Bank transfer (automatic) | 56.15 | 3487.95 | No |
10 rows × 21 columns
Print basic info of the dataset
# Column dtypes and non-null counts; note TotalCharges is read as object.
df_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
# Convert Churn from 'No'/'Yes' to 0/1 and cast to integer so it can be
# used directly as a numeric target (group means, model labels, etc.).
df_data['Churn'] = df_data['Churn'].map({'No': 0, 'Yes': 1}).astype(int)
df_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null int32 dtypes: float64(1), int32(1), int64(2), object(17) memory usage: 1.1+ MB
# Churn rate broken down by gender.
df_plot = df_data.groupby('gender').Churn.mean().reset_index()
churn_bar = go.Bar(
    x=df_plot['gender'],
    y=df_plot['Churn'],
    width=[0.5, 0.5],
    marker=dict(color=['green', 'blue']),
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "Churn Rate"},
    title='Gender',
    plot_bgcolor='rgb(243,243,243)',
    paper_bgcolor='rgb(243,243,243)',
)
fig = go.Figure(data=[churn_bar], layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by Partner status (Yes/No).
df_plot = df_data.groupby('Partner').Churn.mean().reset_index()
churn_bar = go.Bar(
    x=df_plot['Partner'],
    y=df_plot['Churn'],
    width=[0.5, 0.5],
    marker=dict(color=['green', 'blue']),
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "Churn Rate"},
    title='Partner',
    plot_bgcolor='rgb(243,243,243)',
    paper_bgcolor='rgb(243,243,243)',
)
fig = go.Figure(data=[churn_bar], layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by PhoneService (Yes/No).
df_plot = df_data.groupby('PhoneService').Churn.mean().reset_index()
churn_bar = go.Bar(
    x=df_plot['PhoneService'],
    y=df_plot['Churn'],
    width=[0.5, 0.5],
    marker=dict(color=['green', 'blue']),
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "Churn Rate"},
    title='Phone Service',
    plot_bgcolor='rgb(243,243,243)',
    paper_bgcolor='rgb(243,243,243)',
)
fig = go.Figure(data=[churn_bar], layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by MultipleLines (Yes / No / No phone service).
df_plot = df_data.groupby('MultipleLines').Churn.mean().reset_index()
churn_bar = go.Bar(
    x=df_plot['MultipleLines'],
    y=df_plot['Churn'],
    width=[0.5, 0.5, 0.5],
    marker=dict(color=['green', 'blue', 'orange']),
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Multiple Lines',
    yaxis={"title": "Churn Rate"},
    plot_bgcolor='rgb(243,243,243)',
    paper_bgcolor='rgb(243,243,243)',
)
fig = go.Figure(data=[churn_bar], layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by InternetService (DSL / Fiber optic / No).
df_plot = df_data.groupby('InternetService').Churn.mean().reset_index()
churn_bar = go.Bar(
    x=df_plot['InternetService'],
    y=df_plot['Churn'],
    width=[0.5, 0.5, 0.5],
    marker=dict(color=['green', 'blue', 'orange']),
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Internet Service',
    yaxis={"title": "Churn Rate"},
    plot_bgcolor='rgb(243,243,243)',
    paper_bgcolor='rgb(243,243,243)',
)
fig = go.Figure(data=[churn_bar], layout=plot_layout)
pyoff.iplot(fig)
This chart reveals that customers who have Fiber optic as their Internet Service are more likely to churn. I would normally expect Fiber optic customers to churn less because they use a more premium service, but this can happen due to high prices, competition, customer service, and many other reasons.
# Churn rate broken down by OnlineSecurity (Yes / No / No internet service).
df_plot = df_data.groupby('OnlineSecurity').Churn.mean().reset_index()
churn_bar = go.Bar(
    x=df_plot['OnlineSecurity'],
    y=df_plot['Churn'],
    width=[0.5, 0.5, 0.5],
    marker=dict(color=['green', 'blue', 'orange']),
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "Churn Rate"},
    title='Online Security',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=[churn_bar], layout=plot_layout)
pyoff.iplot(fig)
It is clear that people with no internet service will not worry about online security. We can observe the significant difference in churn rate between customers with and without online security. This can be a high-priority feature in our analysis.
# Churn rate broken down by OnlineBackup (Yes / No / No internet service).
df_plot = df_data.groupby('OnlineBackup').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['OnlineBackup'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Online Backup',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by DeviceProtection (Yes / No / No internet service).
df_plot = df_data.groupby('DeviceProtection').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['DeviceProtection'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Device Protection',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by TechSupport (Yes / No / No internet service).
df_plot = df_data.groupby('TechSupport').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['TechSupport'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Tech Support',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by StreamingTV (Yes / No / No internet service).
df_plot = df_data.groupby('StreamingTV').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['StreamingTV'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Streaming TV',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by StreamingMovies (Yes / No / No internet service).
df_plot = df_data.groupby('StreamingMovies').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['StreamingMovies'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Streaming Movies',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Churn rate broken down by Contract (Month-to-month / One year / Two year).
df_plot = df_data.groupby('Contract').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['Contract'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Contract',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
It is clear that the shorter the contract, the higher the churn rate.
# Churn rate broken down by PaperlessBilling (binary Yes/No -> two bars;
# the original listed three widths/colors for two categories).
df_plot = df_data.groupby('PaperlessBilling').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['PaperlessBilling'],
        y=df_plot['Churn'],
        width=[0.5, 0.5],
        marker=dict(color=['green', 'blue'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Paperless Billing',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Surprisingly, customers with e-billing (paperless billing) tend to churn more than customers who receive their bills on paper.
# Churn rate broken down by PaymentMethod (four categories).
df_plot = df_data.groupby('PaymentMethod').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['PaymentMethod'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange', 'red'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Payment Method',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
People paying by electronic check have a significantly higher churn rate than those using other payment methods.
# Summary statistics for tenure (months; ranges 0-72 per the output below).
df_data.tenure.describe()
count 7043.000000 mean 32.371149 std 24.559481 min 0.000000 25% 9.000000 50% 29.000000 75% 55.000000 max 72.000000 Name: tenure, dtype: float64
# Mean churn rate for every distinct tenure value, as a scatter plot.
df_plot = df_data.groupby('tenure').Churn.mean().reset_index()
tenure_scatter = go.Scatter(
    x=df_plot['tenure'],
    y=df_plot['Churn'],
    mode='markers',
    name='Low',
    marker=dict(size=7, line=dict(width=1), color='blue', opacity=0.8),
)
plot_layout = go.Layout(
    yaxis={'title': "Churn Rate"},
    xaxis={'title': "Tenure"},
    title='Tenure based Churn rate',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=[tenure_scatter], layout=plot_layout)
pyoff.iplot(fig)
We can see a clear trend here: the higher the tenure, the lower the churn rate.
# Mean churn rate for every distinct MonthlyCharges value, as a scatter plot.
df_plot = df_data.groupby('MonthlyCharges').Churn.mean().reset_index()
charge_scatter = go.Scatter(
    x=df_plot['MonthlyCharges'],
    y=df_plot['Churn'],
    mode='markers',
    name='Low',
    marker=dict(size=7, line=dict(width=1), color='blue', opacity=0.8),
)
plot_layout = go.Layout(
    yaxis={'title': "Churn Rate"},
    xaxis={'title': "MonthlyCharges"},
    title='Monthly Charges based Churn rate',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=[charge_scatter], layout=plot_layout)
pyoff.iplot(fig)
Unfortunately there is no clear trend between monthly charges and churn.
# Mean churn rate for every distinct TotalCharges value, as a scatter plot.
df_plot = df_data.groupby('TotalCharges').Churn.mean().reset_index()
total_scatter = go.Scatter(
    x=df_plot['TotalCharges'],
    y=df_plot['Churn'],
    mode='markers',
    name='Low',
    marker=dict(size=7, line=dict(width=1), color='blue', opacity=0.8),
)
plot_layout = go.Layout(
    yaxis={'title': "Churn Rate"},
    xaxis={'title': "TotalCharges"},
    title='Total Charges based Churn rate',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=[total_scatter], layout=plot_layout)
pyoff.iplot(fig)
Same as monthly charges, there is no trend between Churn and Total charges
We are going to apply the following steps to create groups:
# order_cluster function will be used to add cluster to the data frame, function will be used for all 3 columns
# order_cluster function will be used to add cluster to the data frame, function will be used for all 3 columns
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel arbitrary KMeans cluster ids so they are ordered by cluster mean.

    KMeans assigns labels in no particular order; this remaps them so that
    label 0 corresponds to the cluster with the lowest mean of
    ``target_field_name`` (when ``ascending`` is True), label 1 the next,
    and so on.

    Parameters
    ----------
    cluster_field_name : str
        Column holding the raw cluster labels.
    target_field_name : str
        Numeric column used to rank the clusters.
    df : pandas.DataFrame
        Frame containing both columns.
    ascending : bool
        Sort direction for the cluster means.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``cluster_field_name`` replaced by ordered labels.
    """
    # Mean of the target per raw cluster, sorted into the desired order.
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    # The positional index after sorting becomes the new, ordered label.
    df_new['index'] = df_new.index
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
import warnings
warnings.filterwarnings('ignore')

# Elbow method for tenure: fit KMeans for k = 1..9 and record the inertia (SSE).
sse = {}
df_cluster = df_data[['tenure']].copy()  # .copy() avoids SettingWithCopyWarning
for k in range(1, 10):
    # Fit on the tenure column only. The original fitted on df_cluster, which
    # gains a "clusters" column after the first iteration, so every k > 1 was
    # accidentally clustering on (tenure, previous labels).
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_cluster[['tenure']])
    df_cluster["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
By using the elbow method we can see that 3 or 4 are optimal numbers of clusters for tenure. We will take 3 for this particular problem.
# Cluster tenure into 3 groups, then relabel the clusters so they are
# ordered by mean tenure (0 = lowest).
kmeans = KMeans(n_clusters=3)
df_data['TenureCluster'] = kmeans.fit_predict(df_data[['tenure']])
df_data = order_cluster('TenureCluster', 'tenure', df_data, True)
# Distribution of tenure inside each ordered cluster.
df_data.groupby('TenureCluster').tenure.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| TenureCluster | ||||||||
| 0 | 2878.0 | 7.512509 | 5.977337 | 0.0 | 2.0 | 6.0 | 12.0 | 20.0 |
| 1 | 1926.0 | 33.854102 | 8.208706 | 21.0 | 26.0 | 34.0 | 41.0 | 48.0 |
| 2 | 2239.0 | 63.048682 | 7.478229 | 49.0 | 56.0 | 64.0 | 70.0 | 72.0 |
# Map the ordered tenure clusters to readable labels and plot churn rate per cluster.
df_data['TenureCluster'] = df_data["TenureCluster"].replace({0:'Low',1:'Mid',2:'High'})
df_plot = df_data.groupby('TenureCluster').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['TenureCluster'],
        y=df_plot['Churn'],
        # three clusters -> three bars (the original listed four widths/colors)
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category", "categoryarray": ['Low', 'Mid', 'High']},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Tenure Cluster vs Churn Rate',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
We can see that we have done a good job clustering the Tenure. It follows same trend. Higher the Tenure, lower the Churn rate.
# Elbow method for MonthlyCharges.
sse = {}
df_cluster = df_data[['MonthlyCharges']].copy()  # .copy() avoids SettingWithCopyWarning
for k in range(1, 10):
    # Fit on MonthlyCharges only; the original fed the previously-added
    # "clusters" column back into every fit after the first iteration.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_cluster[['MonthlyCharges']])
    df_cluster["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
# Cluster MonthlyCharges into 3 groups and add the ordered labels
# (0 = lowest mean charge) to the data frame.
kmeans = KMeans(n_clusters=3)
df_data['MonthlyChargeCluster'] = kmeans.fit_predict(df_data[['MonthlyCharges']])
df_data = order_cluster('MonthlyChargeCluster', 'MonthlyCharges', df_data, True)
# Distribution of MonthlyCharges inside each ordered cluster.
df_data.groupby('MonthlyChargeCluster').MonthlyCharges.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| MonthlyChargeCluster | ||||||||
| 0 | 1892.0 | 23.384619 | 5.660437 | 18.25 | 19.80 | 20.40 | 25.0500 | 42.40 |
| 1 | 2239.0 | 61.628808 | 10.441432 | 42.60 | 51.80 | 61.55 | 70.7000 | 77.80 |
| 2 | 2912.0 | 94.054258 | 10.343944 | 77.85 | 85.05 | 93.90 | 101.9125 | 118.75 |
# Rename clusters to Low / Mid / High and plot churn rate per cluster.
df_data['MonthlyChargeCluster'] = df_data["MonthlyChargeCluster"].replace({0:'Low',1:'Mid',2:'High'})
df_plot = df_data.groupby('MonthlyChargeCluster').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['MonthlyChargeCluster'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category", "categoryarray": ['Low', 'Mid', 'High']},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Monthly Charge Cluster vs Churn Rate',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Here, we can see the importance of clustering. There was no clear trend during EDA, when we observed Scatter plot. After clustering we can see a significant difference in Churn rate for each cluster.
# TotalCharges is read as object because it contains non-numeric entries;
# coerce to numeric (bad values -> NaN) in one pass, then drop those rows.
# (The original coerced twice: once to mark NaNs, once to convert.)
df_data['TotalCharges'] = pd.to_numeric(df_data['TotalCharges'], errors='coerce')
df_data = df_data.dropna()

# Elbow method for TotalCharges.
sse = {}
df_cluster = df_data[['TotalCharges']].copy()  # .copy() avoids SettingWithCopyWarning
for k in range(1, 10):
    # Fit on TotalCharges only; the original fed the previously-added
    # "clusters" column back into every fit after the first iteration.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_cluster[['TotalCharges']])
    df_cluster["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
# Cluster TotalCharges into 3 groups and add the ordered labels
# (0 = lowest mean total charge) to the data frame.
kmeans = KMeans(n_clusters=3)
df_data['TotalChargeCluster'] = kmeans.fit_predict(df_data[['TotalCharges']])
df_data = order_cluster('TotalChargeCluster', 'TotalCharges', df_data, True)
# Distribution of TotalCharges inside each ordered cluster.
df_data.groupby('TotalChargeCluster').TotalCharges.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| TotalChargeCluster | ||||||||
| 0 | 4151.0 | 683.417586 | 569.502364 | 18.8 | 161.3000 | 533.90 | 1133.8000 | 1961.6 |
| 1 | 1614.0 | 3257.957342 | 812.791757 | 1964.6 | 2538.0875 | 3198.80 | 3958.2375 | 4754.3 |
| 2 | 1267.0 | 6283.314246 | 1007.515610 | 4758.8 | 5456.1000 | 6141.65 | 7037.3250 | 8684.8 |
# Rename clusters to Low / Mid / High and plot churn rate per cluster.
df_data['TotalChargeCluster'] = df_data["TotalChargeCluster"].replace({0:'Low',1:'Mid',2:'High'})
df_plot = df_data.groupby('TotalChargeCluster').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=df_plot['TotalChargeCluster'],
        y=df_plot['Churn'],
        width=[0.5, 0.5, 0.5],
        marker=dict(color=['green', 'blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category", "categoryarray": ['Low', 'Mid', 'High']},
    # y-axis title added for consistency with the other churn-rate charts
    yaxis={"title": "Churn Rate"},
    title='Total Charge Cluster vs Churn Rate',
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Now we can see some trend here. Surprisingly, customer with higher total charges has lower churn rate. That can be because of extra and better services and tech support.
# Encode categorical features: binary columns via LabelEncoder,
# multi-valued columns via one-hot encoding (get_dummies).
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
dummy_columns = []  # columns with more than two categories -> one-hot
for column in df_data.columns:
    if df_data[column].dtype == object and column != 'customerID':
        if df_data[column].nunique() == 2:
            # two categories -> single 0/1 column
            df_data[column] = le.fit_transform(df_data[column])
        else:
            dummy_columns.append(column)
df_data = pd.get_dummies(data=df_data, columns=dummy_columns)

# Sanitize column names (spaces, parens, hyphens -> underscores) so they
# are valid identifiers for the GLM formula below.
df_data.columns = [
    column.replace(" ", "_").replace("(", "_").replace(")", "_").replace("-", "_")
    for column in df_data.columns
]
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Build the R-style formula: gender first, then every remaining feature.
other_features = [c for c in df_data.columns if c not in ['Churn', 'customerID', 'gender']]
glm_columns = ' + '.join(['gender'] + other_features)

# Logistic regression (binomial GLM) of Churn on all features.
glm_model = smf.glm(formula='Churn ~ {}'.format(glm_columns), data=df_data, family=sm.families.Binomial())
res = glm_model.fit()
print(res.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Churn No. Observations: 7032
Model: GLM Df Residuals: 7002
Model Family: Binomial Df Model: 29
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2899.5
Date: Sun, 09 May 2021 Deviance: 5799.1
Time: 15:29:45 Pearson chi2: 7.55e+03
No. Iterations: 100
Covariance Type: nonrobust
===========================================================================================================
coef std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------------------------
Intercept 0.2510 0.277 0.907 0.364 -0.291 0.793
gender -0.0251 0.065 -0.386 0.699 -0.153 0.102
SeniorCitizen 0.2254 0.085 2.657 0.008 0.059 0.392
Partner 0.0019 0.078 0.025 0.980 -0.151 0.155
Dependents -0.1338 0.090 -1.485 0.137 -0.310 0.043
tenure -0.0617 0.008 -7.292 0.000 -0.078 -0.045
PhoneService 0.2262 0.403 0.561 0.575 -0.564 1.017
PaperlessBilling 0.3484 0.075 4.656 0.000 0.202 0.495
MonthlyCharges -0.0336 0.032 -1.053 0.292 -0.096 0.029
TotalCharges 9.941e-05 9.98e-05 0.996 0.319 -9.62e-05 0.000
MultipleLines_No -0.1163 0.130 -0.895 0.371 -0.371 0.138
MultipleLines_No_phone_service 0.0248 0.160 0.155 0.877 -0.289 0.339
MultipleLines_Yes 0.3425 0.283 1.209 0.227 -0.213 0.898
InternetService_DSL -0.5974 0.226 -2.642 0.008 -1.041 -0.154
InternetService_Fiber_optic 1.0432 0.578 1.805 0.071 -0.090 2.176
InternetService_No -0.1949 0.091 -2.135 0.033 -0.374 -0.016
OnlineSecurity_No 0.3252 0.108 3.005 0.003 0.113 0.537
OnlineSecurity_No_internet_service -0.1949 0.091 -2.135 0.033 -0.374 -0.016
OnlineSecurity_Yes 0.1207 0.261 0.462 0.644 -0.391 0.633
OnlineBackup_No 0.2208 0.107 2.066 0.039 0.011 0.430
OnlineBackup_No_internet_service -0.1949 0.091 -2.135 0.033 -0.374 -0.016
OnlineBackup_Yes 0.2251 0.261 0.863 0.388 -0.286 0.736
DeviceProtection_No 0.1460 0.107 1.359 0.174 -0.065 0.356
DeviceProtection_No_internet_service -0.1949 0.091 -2.135 0.033 -0.374 -0.016
DeviceProtection_Yes 0.2999 0.261 1.150 0.250 -0.211 0.811
TechSupport_No 0.3113 0.108 2.884 0.004 0.100 0.523
TechSupport_No_internet_service -0.1949 0.091 -2.135 0.033 -0.374 -0.016
TechSupport_Yes 0.1346 0.262 0.514 0.607 -0.379 0.648
StreamingTV_No -0.0585 0.048 -1.215 0.224 -0.153 0.036
StreamingTV_No_internet_service -0.1949 0.091 -2.135 0.033 -0.374 -0.016
StreamingTV_Yes 0.5044 0.340 1.484 0.138 -0.162 1.170
StreamingMovies_No -0.0570 0.049 -1.172 0.241 -0.152 0.038
StreamingMovies_No_internet_service -0.1949 0.091 -2.135 0.033 -0.374 -0.016
StreamingMovies_Yes 0.5028 0.340 1.480 0.139 -0.163 1.169
Contract_Month_to_month 0.7689 0.118 6.518 0.000 0.538 1.000
Contract_One_year 0.0917 0.121 0.758 0.448 -0.145 0.329
Contract_Two_year -0.6096 0.149 -4.096 0.000 -0.901 -0.318
PaymentMethod_Bank_transfer__automatic_ 0.0324 0.097 0.332 0.740 -0.158 0.223
PaymentMethod_Credit_card__automatic_ -0.0551 0.099 -0.559 0.576 -0.249 0.138
PaymentMethod_Electronic_check 0.3216 0.087 3.713 0.000 0.152 0.491
PaymentMethod_Mailed_check -0.0478 0.097 -0.492 0.623 -0.238 0.143
TenureCluster_High 0.4942 0.189 2.617 0.009 0.124 0.864
TenureCluster_Low -0.0894 0.171 -0.522 0.602 -0.425 0.246
TenureCluster_Mid -0.1538 0.119 -1.291 0.197 -0.387 0.080
MonthlyChargeCluster_High 0.0622 0.169 0.368 0.713 -0.269 0.394
MonthlyChargeCluster_Low 0.1103 0.195 0.565 0.572 -0.272 0.493
MonthlyChargeCluster_Mid 0.0785 0.127 0.618 0.537 -0.171 0.328
TotalChargeCluster_High 0.4344 0.206 2.105 0.035 0.030 0.839
TotalChargeCluster_Low -0.3447 0.177 -1.951 0.051 -0.691 0.002
TotalChargeCluster_Mid 0.1613 0.122 1.321 0.186 -0.078 0.400
===========================================================================================================
We can see all the generated dummy variables here
We have two important outcomes from this report. When you prepare a Churn Prediction model, you will face with the questions below:
For the first question, you should look at the 4th column (P>|z|). If the p-value is smaller than 0.05, that feature affects Churn in a statistically significant way. Examples are:
the second question. We want to reduce the Churn Rate, where we should start? The scientific version of this question is; Which feature will bring the best ROI if I increase/decrease it by one unit?
That question can be answered by looking at the coef column. Exponential coef gives us the expected change in Churn Rate if we change it by one unit. If we apply the code below, we will see the transformed version of all coefficients:
# Exponentiate the GLM coefficients to express them as odds ratios.
np.exp(res.params)
Intercept 1.285303 gender 0.975164 SeniorCitizen 1.252831 Partner 1.001918 Dependents 0.874745 tenure 0.940196 PhoneService 1.253830 PaperlessBilling 1.416844 MonthlyCharges 0.966977 TotalCharges 1.000099 MultipleLines_No 0.890239 MultipleLines_No_phone_service 1.025102 MultipleLines_Yes 1.408420 InternetService_DSL 0.550250 InternetService_Fiber_optic 2.838417 InternetService_No 0.822942 OnlineSecurity_No 1.384300 OnlineSecurity_No_internet_service 0.822942 OnlineSecurity_Yes 1.128252 OnlineBackup_No 1.247084 OnlineBackup_No_internet_service 0.822942 OnlineBackup_Yes 1.252393 DeviceProtection_No 1.157155 DeviceProtection_No_internet_service 0.822942 DeviceProtection_Yes 1.349724 TechSupport_No 1.365169 TechSupport_No_internet_service 0.822942 TechSupport_Yes 1.144063 StreamingTV_No 0.943170 StreamingTV_No_internet_service 0.822942 StreamingTV_Yes 1.655947 StreamingMovies_No 0.944619 StreamingMovies_No_internet_service 0.822942 StreamingMovies_Yes 1.653406 Contract_Month_to_month 2.157297 Contract_One_year 1.096050 Contract_Two_year 0.543582 PaymentMethod_Bank_transfer__automatic_ 1.032903 PaymentMethod_Credit_card__automatic_ 0.946352 PaymentMethod_Electronic_check 1.379272 PaymentMethod_Mailed_check 0.953330 TenureCluster_High 1.639232 TenureCluster_Low 0.914453 TenureCluster_Mid 0.857440 MonthlyChargeCluster_High 1.064173 MonthlyChargeCluster_Low 1.116620 MonthlyChargeCluster_Mid 1.081653 TotalChargeCluster_High 1.544099 TotalChargeCluster_Low 0.708423 TotalChargeCluster_Mid 1.174999 dtype: float64
As an example, one unit change in Monthly Charge means ~3.4% improvement in the odds for churning if we keep everything else constant. From the table above, we can quickly identify which features are more important.
change in churn odds = 1 − exp(coef); e.g. for MonthlyCharges, 1 − exp(−0.0336) ≈ 1 − 0.967 ≈ 3.4% lower odds per unit.
# Create feature set and labels.
X = df_data.drop(['Churn', 'customerID'], axis=1)
y = df_data.Churn

# Hold out 5% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=3)

# Build the model. eval_metric is set explicitly to silence the
# XGBoost >= 1.3 warning; 'logloss' is already the default for
# binary:logistic, so the fitted model is unchanged.
xgb_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.08,
                              objective='binary:logistic',
                              eval_metric='logloss',
                              n_jobs=-1).fit(X_train, y_train)
print('Accuracy of XGB classifier on training set: {:.2f}'
      .format(xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'
      .format(xgb_model.score(X_test[X_train.columns], y_test)))
[15:49:57] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. Accuracy of XGB classifier on training set: 0.84 Accuracy of XGB classifier on test set: 0.81
# Precision/recall breakdown per class on the held-out test set.
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.86 0.90 0.88 272
1 0.59 0.50 0.54 80
accuracy 0.81 352
macro avg 0.72 0.70 0.71 352
weighted avg 0.80 0.81 0.80 352
We can interpret the report above as follows: of the customers the model predicts will churn, 59% actually do (0.59 precision), and the model identifies only 50% of the customers who actually churn (0.50 recall). Recall is the main problem here.
let’s see how our model works in detail. First off, we want to know which features our model exactly used from the dataset. Also, which were the most important ones?
# Plot which features the trained model relies on most.
# NOTE(review): plot_importance uses its default importance_type here —
# confirm this is the metric you want to report.
from xgboost import plot_importance
fig, ax = plt.subplots(figsize=(10,8))
plot_importance(xgb_model, ax=ax)
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
Finally, the best way to use this model is assigning Churn Probability for each customer, create segments, and build strategies on top of that.
# Assign each customer the model's churn probability (class-1 column of
# predict_proba) and preview it alongside the customer id.
df_data['proba'] = xgb_model.predict_proba(df_data[X_train.columns])[:,1]
df_data[['customerID', 'proba']].head()
| customerID | proba | |
|---|---|---|
| 0 | 7590-VHVEG | 0.639258 |
| 1 | 6713-OKOMC | 0.180708 |
| 2 | 7469-LKBCI | 0.016671 |
| 3 | 8779-QRDMV | 0.907474 |
| 4 | 1680-VDCWW | 0.038562 |